Aim

This analysis is to explore the best telomere mixed model to fit the CML colony data

Setup

Libraries

knitr::opts_chunk$set(collapse = TRUE)
  
library(tidyverse)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(ggbeeswarm)
library(ggtext)
library(ggeffects)
library(ggnewscale)
library(ggh4x)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(patchwork)

library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(optimx)
library(dfoptim)
library(lmeresampler)

library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(performance)
library(parameters)

import::from(.from = "dplyr", count, select, rename, slice)

Functions

# Add totals to df
addTotal <-
  function(df) {
    df %>% mutate(across(where(is.logical), as.character)) %>%
      bind_rows(summarise(.,
                          across(where(is.numeric), sum,  na.rm = T),
                          across(where(is.character), ~ "Total")))
  }

# Lmer optimiser QC

# allfit summary 
all_fit.summary <- function(all_fit) {
  broom.mixed::glance(all_fit) %>%  select(optimizer, AIC, BIC, NLL_rel)  %>%  arrange(NLL_rel)

}


# allfit compare estimated ranked by fit metrics
all_fit.compare <- function(all_fit) {
  Reduce(left_join, list(
    broom.mixed::glance(all_fit), 
    Reduce(bind_rows, list(broom.mixed::tidy(all_fit) %>% filter(effect == "fixed"),
    lapply(all_fit, function(x){ x@optinfo$conv$opt}) %>% unlist() %>% enframe(name = "optimizer", value = "estimate") %>% mutate(term = "lme4.optimizer_code"),
    lapply(all_fit, function(x){ as.numeric(isSingular(x))}) %>% unlist() %>% enframe(name = "optimizer", value = "estimate") %>% mutate(term = "isSingular"),
    lapply(all_fit, function(x){ as.numeric(check_convergence(x))}) %>% unlist() %>% enframe(name = "optimizer", value = "estimate") %>% mutate(term = "Converged")
  ))))  %>%
  ggplot(aes(reorder(optimizer, -NLL_rel), estimate, colour = optimizer)) +
  geom_point() +
  facet_grid(0 ~ term, scale = "free_x") +
  coord_flip() + guides(colour = "none") +  
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))

}

#  Model check
model.check <- function(model) {
  n <- rlang::expr_name(rlang::enexpr(model))
  res <- enframe(c(
    "Formula" = unlist(str_split(as.character(model@call), pattern = "\\="))[2],
    "Observations" = nrow(model@frame),
    "REML" = isREML(model),
    "Optimizer" = model@optinfo$optimizer,
    "Singular" = isSingular(model),
    "Converged" = check_convergence(model)
  )) %>%  tab_df(title = paste0("Model ", n),
                 col.header = c("Test", "Value"))
  return(res)
}

# get model formula
get_formula <- function(model) {
  str_sub(unlist(str_split(as.character(model@call), pattern = "\\="))[2])

}

Data

per_sample_statistics.n834.df <- 
  read_csv("../data/per_sample_statistics.n834.csv")
## New names:
## Rows: 834 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── Delimiter: "," chr
## (7): Patient, Sample, tip.label, driver3, platform_unit, run_id.uniq, status dbl (25): ...1, age_at_sample_exact, meanvaf, rho.bb, meandepth, sensitivity, F1, F2, F4, Psi,
## Insert_mean, Insert_sd, Read_length, Initial_read_length, F2a, Length, Se...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

driver.palette <- c(`Wild-type` = "#0072B5FF", Other ="#E18727FF",  `*BCR::ABL1*`="#BC3C29FF")
driver.palette1 <- c(WT = "#0072B5FF", Other ="#E18727FF",  BCR_ABL="#BC3C29FF")
driver.palette2 <- c(Wt = "#0072B5FF", Mt="#BC3C29FF")
driver.palette2b <- c(Absence = "#717D6E", Presence="#BC3C29FF")

Setup analysis variables

# BCR::ABL1-mutated vs non mutated
per_sample_statistics.n834.df %<>%
  mutate(BCR_ABL1 = ifelse(grepl("BCR::ABL1", driver3, perl = T), "Mt", "Wt"))

#  set Wt as ref
per_sample_statistics.n834.df %<>%  mutate(BCR_ABL1 = factor(BCR_ABL1, levels = c("Wt", "Mt")))


# BCR::ABL1-mutated vs Other driver vs wildtype
per_sample_statistics.n834.df %<>% mutate(driver2 = ifelse(driver3 == "WT", "WT", ifelse(
  grepl("BCR::ABL1", driver3, perl = T),
  "BCR_ABL",
  "Other"
)))

per_sample_statistics.n834.df %<>% mutate(driver2 = factor(driver2, levels = c("WT", "Other", "BCR_ABL")))

# Set batch variable as character
per_sample_statistics.n834.df %<>% mutate(library.cluster = as.character(library.cluster))

In total 834 samples passed QC across 9 patients. Age (age_at_sample_exact) was defined as the count of completed years from birth at sampling and sample mutant status (BCR_ABL1) was defined as BCR::ABL1 positive (Mt; n=365) or negative (Wt; n=469).

per_sample_statistics.n834.df %>%
  count(Patient, BCR_ABL1) %>%
  pivot_wider(names_from = BCR_ABL1, values_from = n) %>%
  addTotal() %>%
  tab_df()
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), sum, na.rm = T)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
Patient Wt Mt
PD51632 24 59
PD51633 77 4
PD51634 48 75
PD51635 85 32
PD56961 105 58
PD57332 NA 35
PD57333 75 NA
PD57334 44 11
PD57335 11 91
Total 469 365

Add labels

attr(per_sample_statistics.n834.df$Length, "label") <- "Mean telomere length (bp)"
attr(per_sample_statistics.n834.df$nsub_adj, "label") <- "SNV burden (depth adjusted)"
attr(per_sample_statistics.n834.df$age_at_sample_exact, "label") <- "Age (years)"
attr(per_sample_statistics.n834.df$BCR_ABL1, "label") <- "BCR::ABL1 status"
attr(per_sample_statistics.n834.df$driver2, "label") <- "Driver status"

Factors affecting telomere length estimation in NovaSeq sequencing

Mitchell et al (2022) showed that telomerecat telomere length estimates from NovaSeq sequenced samples are problematic with very high estimates and zero length estimates compared to HiSeqX sequenced samples.

Read quality of telomeric reads

We hypothesised that a likely contributing cause is the change from 4 colour (HiSeqX) to 2 colour (NovaSeq) sequencing. Therefore NovaSeq sequencing outputs a “G” when there is a “G” base and when there is an error, thus causing misattribution of telomeric reads by telomerecat.

Reads containing telomeric repeats (from telomerecat telbam) show a marked increase in “G” content after 75bp.

Adjusting Telomerecat estimates

Running Telomerecat with -t 75 “trims” read attribution to the first 75bp resulting in a decrease in outlier estimates.

Trimmed telomere length vs. age at sampling

Plotting the trimmed telomerecat mean telomere lengths (bp) vs. age at sampling cohort (n=834) shows:

  • The Wild-type trend is similar to the prior estimate from Mitchell et al.

  • In most patients the BCR::ABL1 mutated samples have shorter telomere lengths

  • There seems to be no overt difference between wild-type and Other driver samples

# Setup facet xlim
x_lim.df <-
  per_sample_statistics.n834.df %>%  group_by(Patient) %>%  summarise(min = min(age_at_sample_exact),
                                                                              max = max(age_at_sample_exact))

x_scale <- lapply(1:nrow(x_lim.df), function(x) {
  y = round(x_lim.df$min[x] + ((x_lim.df$max[x] - x_lim.df$min[x]) / 2))
  scale_x_continuous(limits = c(y - 7, y + 7), breaks = pretty_breaks())
})

names(x_scale) <- x_lim.df$Patient

per_sample_statistics.n834.df %>%
  mutate(
    Patient = factor(
      Patient,
      levels = c(
        "PD57334",
        "PD56961",
        "PD57335",
        "PD51634",
        "PD51633",
        "PD51632",
        "PD51635",
        "PD57333",
        "PD57332"
      )
    ),
    `*BCR::ABL1* status` = factor(
      BCR_ABL1,
      levels = c("Wt", "Mt"),
      labels = c("Absence", "Presence")
    ),
    `Driver status` = factor(
      driver2,
      levels = c("BCR_ABL", "WT", "Other"),
      labels = c("*BCR::ABL1*", "Wild-type", "Other")
    ),
    `Mean telomere length (bp)` = Length,
    `Age (years)` = age_at_sample_exact,
    group2 = paste0(Patient, "_", age_at_sample_exact, "_", `Driver status`)
  ) %>%
  ggplot(
    aes(
      `Age (years)`,
      `Mean telomere length (bp)`,
      colour = `Driver status`,
      group.by = group2
    )
  ) +
  geom_quasirandom(width = 1, alpha = 0.6) +
  scale_colour_manual(values = driver.palette) +
  theme_bw(base_size = 16) +
  theme(legend.title = element_markdown(), legend.text = element_markdown()) +
  facet_grid( ~ Patient, scales = "free_x") +
  guides(color = guide_legend(override.aes = list(linetype = c(rep(0, length(driver.palette)))))) +
  new_scale_colour() +
  scale_color_grey() +
  geom_abline(
    data = as_tibble(list(
      a = 4512.38235,
      b = -30.80877,
      Reference = "Mitchell et al. -30bp/year"
    )),
    aes(
      intercept = a,
      slope = b,
      colour = Reference
    ),
    lty = 2,
    show.legend = TRUE
  )  +
  facetted_pos_scales(x = x_scale[c(
    "PD57334",
    "PD56961",
    "PD57335",
    "PD51634",
    "PD51633",
    "PD51632",
    "PD51635",
    "PD57333",
    "PD57332"
  )])

Batch effects

Unlike the SNV burden analysis where we have a well formed correction for sequencing depth given sensitivity of germline SNP recall and sample VAF, telomerecat telomere length estimation is likely to incur significant batch effects.

Telomerecat estimates the mean telomere length using counts of telomeric reads (F1) and telomere boundary reads (F2_a). This makes it robust to ploidy with the assumption that F1 and F2_a reads are recalled at the same rate.

We hypothesised that this assumption could be affected by:

  1. Lower read depth - causing telomere boundary reads (F2_a) to be stochastically undersampled thereby giving an overestimate of telomere length

  2. Sequencing library preparation - causing batch to batch variation in the ratio of F1/F2a reads

Recall of telomere boundary reads

We could assume that in an diploid unbiased sample the number of telomere boundary reads (F2_a) divided by 46 should equal the sample sequencing coverage (Seq.X).

In fact we see divergences from this assumption in 3-4 of our Patients with differences across the BCR::ABL1 fusion status.

per_sample_statistics.n834.df %>%
  mutate(`*BCR::ABL1* status` = factor(
    BCR_ABL1,
    levels = c("Wt", "Mt"),
    labels = c("Absence", "Presence")
  )) %>%
  ggplot(aes(Seq.X, F2a / 46, color = `*BCR::ABL1* status`)) +
  geom_point(alpha =0.6) +
  scale_colour_manual(values = driver.palette2b) +
  theme_bw(base_size = 16) +
  theme(
    legend.title = element_markdown(),
    legend.text = element_markdown(),
    legend.position = "bottom"
  ) +
  geom_abline(intercept = 0, slope = 1) +
  facet_grid(~ Patient)

The ratio of telomere boundary reads (F2a) to the sum of all boundary reads with telomeric repeats (F2 + F4) is lower in these samples, where we would expect the ratio to be similar at the patient-level.

per_sample_statistics.n834.df %>%
  ggplot(aes(Seq.X, F2a / 46, color = F2a / (F2 + F4))) +
  geom_point() +
  scale_color_viridis_c(option = "A",
                        trans = scales::pseudo_log_trans(sigma = 0.001)) +
  theme_bw(base_size = 16) +
  theme(
    legend.position = "bottom"
  ) +  geom_abline(intercept = 0, slope = 1) +
  facet_grid( ~
                Patient)

Define candidate batch variables

We defined two candidate batch variables:

  1. The library preparation cluster (library.cluster), that is the library preparation date clustered by adjacent days (n=22)

  2. The unique sequence run ID (run_id.uniq) which is the concatenated flowcell IDs of for each sample (n=31)

With “run_id.uniq” a more granular variable which accounts for both library preparation cluster and sequencing batch.

library.cluster.panel <- 
  per_sample_statistics.n834.df %>%
  mutate(F2a_ratio = F2a / (F2 + F4)) %>%
  filter(Patient  %in%  c("PD51632", "PD56961")) %>%
  mutate(`*BCR::ABL1* status` = factor(
    BCR_ABL1,
    levels = c("Wt", "Mt"),
    labels = c("Absence", "Presence")
  )) %>%
  select(
    Patient,
    `*BCR::ABL1* status`,
    age_at_sample_exact,
    library.cluster,
    Length,
    F2a_ratio,
    Seq.X
  ) %>% pivot_longer(cols = Length:Seq.X) %>%
  mutate(name = factor(name, levels = c("Seq.X", "F2a_ratio", "Length"))) %>%
  ggplot(aes(value, library.cluster, colour = `*BCR::ABL1* status`)) +
  geom_quasirandom(width = 0.2 , alpha = 0.6) +
  scale_colour_manual(values = driver.palette2b) +
  theme_bw(base_size = 12) +
  theme(
    legend.title = element_markdown(),
    legend.text = element_markdown(),
    legend.position = "bottom"
  ) +
  facet_grid(Patient + age_at_sample_exact  ~ name, scales = "free")

run_id.uniq.panel <-
  per_sample_statistics.n834.df %>%
  mutate(F2a_ratio = F2a / (F2 + F4)) %>%
  filter(Patient  %in%  c("PD51632", "PD56961")) %>%
  mutate(`*BCR::ABL1* status` = factor(
    BCR_ABL1,
    levels = c("Wt", "Mt"),
    labels = c("Absence", "Presence")
  )) %>%
  select(
    Patient,
    `*BCR::ABL1* status`,
    age_at_sample_exact,
    run_id.uniq,
    Length,
    F2a_ratio,
    Seq.X
  ) %>% pivot_longer(cols = Length:Seq.X) %>%
  mutate(name = factor(name, levels = c("Seq.X", "F2a_ratio", "Length"))) %>%
  ggplot(aes(value, run_id.uniq, colour = `*BCR::ABL1* status`)) +
  geom_quasirandom(width = 0.2 , alpha = 0.6) +
  scale_colour_manual(values = driver.palette2b) +
  theme_bw(base_size = 12) +
  theme(
    legend.title = element_markdown(),
    legend.text = element_markdown(),
    legend.position = "bottom"
  ) +
  facet_grid(Patient + age_at_sample_exact  ~ name, scales = "free")

(library.cluster.panel / run_id.uniq.panel)  + plot_layout(guides = 'collect') &
  theme(legend.position='bottom')
## Orientation inferred to be along y-axis; override with `position_quasirandom(orientation = 'x')`
## Orientation inferred to be along y-axis; override with `position_quasirandom(orientation = 'x')`

Use of mixed models

Linear mixed models implemented in the R package “lme4” were used due to the repeated measures at the patient-level in the data set. Models were fitted with default “lme4” parameters, if a model did not converge, lme4::allFit() was used to refit the model to all available optimisers (provide by the “lme4”, “optimx”, and “dfoptim” R packages), the best optimiser was selected from non-singular and converged refits with the highest negative log-likelihood. Only non-singular and converged models were considered for model selection using the Bayesian information criterion (BIC).

Batch variable selection in wild-type only model

To identify an effective batch effect variable we restrict the data to BCR::ABL-ve samples (n=469) and compare each candidate batch effect variable to a baseline model with “age at sample” as a fixed effect and “patient” as a random effect.

Baseline model

tel_wt.lmer_0 <-
  lmer(
    Length ~  age_at_sample_exact + (1 | Patient),
    data = per_sample_statistics.n834.df%>% filter(BCR_ABL1 == "Wt"), REML = F
  )

model.check(tel_wt.lmer_0)
Model tel_wt.lmer_0
Test Value
Formula Length ~ age_at_sample_exact + (1 | Patient)
Observations 469
REML FALSE
Optimizer nloptwrap
Singular FALSE
Converged TRUE

library.cluster model

tel_wt.lmer_0b <-
  lmer(
    Length ~  age_at_sample_exact +  (1 | Patient) + (1 | library.cluster),
    data = per_sample_statistics.n834.df %>% filter(BCR_ABL1 == "Wt"), REML = F
  )

model.check(tel_wt.lmer_0b)
Model tel_wt.lmer_0b
Test Value
Formula Length ~ age_at_sample_exact + (1 | Patient) + (1 | library.cluster)
Observations 469
REML FALSE
Optimizer nloptwrap
Singular FALSE
Converged TRUE

run_id.uniq model

tel_wt.lmer_0c <-
  lmer(
    Length ~  age_at_sample_exact + (1 | Patient) + (1 |  run_id.uniq),
    data = per_sample_statistics.n834.df %>% filter(BCR_ABL1 == "Wt"), REML = F
  )

model.check(tel_wt.lmer_0c)
Model tel_wt.lmer_0c
Test Value
Formula Length ~ age_at_sample_exact + (1 | Patient) + (1 | run_id.uniq)
Observations 469
REML FALSE
Optimizer nloptwrap
Singular FALSE
Converged TRUE

Compare wild-type models

Across all models the mean telomere length attrition rates estimates (per year) are compatible with Mitchell et al. 2022, showing that NovaSeq derived telomerecat mean telomere length can be used.

Both candidate batch variables improve the baseline model’s fit to the data, with ” + (1 | run_id.uniq)” (tel.lmer_0c; BIC=7483.33) identified as the better variable over ” + (1 | library.cluster)” (tel.lmer_0b; BIC=7507.51).

tab_model(
  tel_wt.lmer_0,
  tel_wt.lmer_0b,
  tel_wt.lmer_0c,
  show.se = T,
  show.stat = T,
  show.ci = F,
  show.p = F,
  p.style = "scientific",
  dv.labels = c(
    "Baseline (tel_wt.lmer_0)",
    "library.cluster random effect (tel_wt.lmer_0b)",
    "run_id.uniq random effect (tel_wt.lmer_0c)"
  )
)
  Baseline (tel_wt.lmer_0) library.cluster random effect (tel_wt.lmer_0b) run_id.uniq random effect (tel_wt.lmer_0c)
Predictors Estimates std. Error Statistic Estimates std. Error Statistic Estimates std. Error Statistic
(Intercept) 6405.08 402.65 15.91 6630.98 429.51 15.44 6473.68 523.02 12.38
Age(years) -37.43 7.39 -5.07 -41.48 7.95 -5.21 -36.60 9.66 -3.79
Random Effects
σ2 485708.47 460160.79 422854.55
τ00 85546.75 Patient 74844.06 library.cluster 211454.29 run_id.uniq
  35940.64 Patient 17558.01 Patient
ICC 0.15 0.19 0.35
N 8 Patient 8 Patient 8 Patient
  19 library.cluster 21 run_id.uniq
Observations 469 469 469
Marginal R2 / Conditional R2 0.300 / 0.405 0.345 / 0.472 0.264 / 0.523

ANOVA

tel_wt.lmer_0 vs. tel_wt.lmer_0b
broom::tidy(anova(tel_wt.lmer_0, tel_wt.lmer_0b)) %>%
mutate(p.value = format.pval(p.value, digits = 3, eps = 1e-50)) %>% tab_df()
term npar AIC BIC logLik deviance statistic df p.value
tel_wt.lmer_0 4 7497.97 7514.57 -3744.99 7489.97 NA NA NA
tel_wt.lmer_0b 5 7486.75 7507.51 -3738.38 7476.75 13.22 1 0.000277
tel_wt.lmer_0 vs. tel_wt.lmer4_0c
broom::tidy(anova(tel_wt.lmer_0, tel_wt.lmer_0c)) %>%
mutate(p.value = format.pval(p.value, digits = 3, eps = 1e-50)) %>% tab_df()
term npar AIC BIC logLik deviance statistic df p.value
tel_wt.lmer_0 4 7497.97 7514.57 -3744.99 7489.97 NA NA NA
tel_wt.lmer_0c 5 7462.58 7483.33 -3726.29 7452.58 37.39 1 9.66e-10

Telomere model analysis

With a batch variable now selected we now look to model the effect of BCR::ABL1 fusion status on mean telomere length, we will build the model (accounting for batch) in a stepwise fashion:

  1. Confirm a BCR::ABL fusion status effect as a fixed effect only

  2. Test the addition of explanatory variables to the random effect configuration

1. Is there a BCR::ABL1 fusion status effect on telomere length?

Using the full dataset (n=834), we compare the inclusion of BCR::ABL1 fusion status as a fixed effect to a null model “Length ~ age_at_sample_exact + (1 | Patient) + (1| run_id.uniq)”.

Null model

tel.lmer2_0 <-
  lmer(
    Length ~  age_at_sample_exact + (1 | Patient) + (1| run_id.uniq),
    data = per_sample_statistics.n834.df , REML = F
  )

model.check(tel.lmer2_0)
Model tel.lmer2_0
Test Value
Formula Length ~ age_at_sample_exact + (1 | Patient) + (1 | run_id.uniq)
Observations 834
REML FALSE
Optimizer nloptwrap
Singular FALSE
Converged TRUE

Model 1 - driver as fixed effect

tel.lmer2_1 <-
  lmer(
    Length ~  age_at_sample_exact + BCR_ABL1 +  (1 | Patient) + (1| run_id.uniq),
    data = per_sample_statistics.n834.df, REML = F
  )

model.check(tel.lmer2_1)
Model tel.lmer2_1
Test Value
Formula Length ~ age_at_sample_exact + BCR_ABL1 + (1 | Patient) + (1 | run_id.uniq)
Observations 834
REML FALSE
Optimizer nloptwrap
Singular FALSE
Converged TRUE

Compare null to driver as fixed effect

Adding BCR::ABL fusion status as a fixed effect improves the model fit to the data (tel.lmer2_1; BIC=1.326602^{4}) over the null model (tel.lmer2_0; BIC=1.329903^{4}), confirming a BCR::ABL fusion status effect.

tab_model(
  tel.lmer2_0,
  tel.lmer2_1,
  show.se = T,
  show.stat = T,
  show.ci = F,
  show.p = F,
  p.style = "scientific",
  dv.labels = c("Baseline (SNV.lmer_0)", "Driver as fixed effect (SNV.lmer2_1)")
)
  Baseline (SNV.lmer_0) Driver as fixed effect (SNV.lmer2_1)
Predictors Estimates std. Error Statistic Estimates std. Error Statistic
(Intercept) 5103.59 470.05 10.86 5746.17 472.78 12.15
Age(years) -14.43 8.54 -1.69 -21.58 8.46 -2.55
BCR::ABL 1 status: Mt -627.53 97.07 -6.47
Random Effects
σ2 426493.29 409590.95
τ00 333723.54 run_id.uniq 232305.56 run_id.uniq
26004.99 Patient 47772.75 Patient
ICC 0.46 0.41
N 9 Patient 9 Patient
31 run_id.uniq 31 run_id.uniq
Observations 834 834
Marginal R2 / Conditional R2 0.044 / 0.481 0.185 / 0.516

ANOVA

broom::tidy(anova(tel.lmer2_0, tel.lmer2_1)) %>%
mutate(p.value = format.pval(p.value, digits = 3, eps = 1e-50)) %>% tab_df()
term npar AIC BIC logLik deviance statistic df p.value
tel.lmer2_0 5 13275.40 13299.03 -6632.70 13265.40 NA NA NA
tel.lmer2_1 6 13237.67 13266.02 -6612.83 13225.67 39.73 1 2.92e-10

2. Random effect configuration

We now want to improve the baseline model (tel.lmer2_1), so we test whether “Patient” has an effect on the slope as well as the intercept in 3 models;

  • Model 2 - age at sampling
  • Model 3 - driver status

Model 2

tel.lmer2_2 <-
  lmer(
    Length ~  age_at_sample_exact + BCR_ABL1 + (1 + age_at_sample_exact | Patient) + (1 | run_id.uniq),
    data = per_sample_statistics.n834.df,
    REML = F
  )
## boundary (singular) fit: see help('isSingular')

model.check(tel.lmer2_2)
Model tel.lmer2_2
Test Value
Formula Length ~ age_at_sample_exact + BCR_ABL1 + (1 + age_at_sample_exact | Patient) + (1 | run_id.uniq)
Observations 834
REML FALSE
Optimizer nloptwrap
Singular TRUE
Converged FALSE

Model 3

tel.lmer2_3 <-
  lmer(
    Length ~  age_at_sample_exact + BCR_ABL1 + (1 + BCR_ABL1 | Patient) + (1 | run_id.uniq),
    data = per_sample_statistics.n834.df, REML = F
  )

model.check(tel.lmer2_3)
Model tel.lmer2_3
Test Value
Formula Length ~ age_at_sample_exact + BCR_ABL1 + (1 + BCR_ABL1 | Patient) + (1 | run_id.uniq)
Observations 834
REML FALSE
Optimizer nloptwrap
Singular FALSE
Converged TRUE

Model comparison

Model 2 was singular suggesting the random effect structure is too complex for the data and was dropped from the comparison to the baseline model (tel.lmer2_1:

  • tel.lmer2_3: Length ~ age_at_sample_exact + BCR_ABL1 + (1 + BCR_ABL1 | Patient) + (1 | run_id.uniq)

Incorporation of BCR::ABL fusion status into the patient-level random effect improves model fit to the data (tel.lmer2_3; BIC=13265.15) over the baseline model (tel.lmer2_1; BIC=13266.02).

tab_model(
  tel.lmer2_1,
  tel.lmer2_3,
  show.se = T,
  show.stat = T,
  show.ci = F,
  show.p = F,
  p.style = "scientific",
  dv.labels = c(
    "Baseline (tel.lmer2_1)",
    "Random driver effect and intercept: (tel.lmer2_3)"
  )
) 
  Baseline (tel.lmer2_1) Random driver effect and intercept: (tel.lmer2_3)
Predictors Estimates std. Error Statistic Estimates std. Error Statistic
(Intercept) 5746.17 472.78 12.15 6221.92 412.72 15.08
Age(years) -21.58 8.46 -2.55 -31.57 7.56 -4.17
BCR::ABL 1 status: Mt -627.53 97.07 -6.47 -550.88 209.88 -2.62
Random Effects
σ2 409590.95 399535.37
τ00 232305.56 run_id.uniq 206550.37 run_id.uniq
47772.75 Patient 12534.25 Patient
τ11   264196.09 Patient.BCR_ABL1Mt
ρ01   -0.41 Patient
ICC 0.41 0.44
N 9 Patient 9 Patient
31 run_id.uniq 31 run_id.uniq
Observations 834 834
Marginal R2 / Conditional R2 0.185 / 0.516 0.236 / 0.572

ANOVA

tel.lmer2_1 vs. tel.lmer2_3
broom::tidy(anova(tel.lmer2_1, tel.lmer2_3)) %>%
mutate(p.value = format.pval(p.value, digits = 3, eps = 1e-50)) %>% tab_df()
term npar AIC BIC logLik deviance statistic df p.value
tel.lmer2_1 6 13237.67 13266.02 -6612.83 13225.67 NA NA NA
tel.lmer2_3 8 13227.34 13265.15 -6605.67 13211.34 14.33 2 0.000774

3. Final model

Refit with REML

tel.lmer2_3.REML <-
  update(
    tel.lmer2_3,
    REML = T,
    control = lmerControl(
      optimizer = tel.lmer2_3@optinfo$optimizer,
      optCtrl = tel.lmer2_3@optinfo$control
    )
  )

model.check(tel.lmer2_3.REML)
Model tel.lmer2_3.REML
Test Value
Formula Length ~ age_at_sample_exact + BCR_ABL1 + (1 + BCR_ABL1 | Patient) + (1 | run_id.uniq)
Observations 834
REML TRUE
Optimizer nloptwrap
Singular FALSE
Converged TRUE

Check model

check_model(tel.lmer2_3.REML)

Model summary

tab_model(
  tel.lmer2_3.REML,
  show.se = T,
  show.stat = T,
  show.ci = F,
  show.p = F,
  p.style = "scientific"
)
  Mean telomere length(bp)
Predictors Estimates std. Error Statistic
(Intercept) 6179.88 444.18 13.91
Age(years) -30.79 8.15 -3.78
BCR::ABL 1 status: Mt -556.90 225.06 -2.47
Random Effects
σ2 399685.76
τ00 run_id.uniq 211135.35
τ00 Patient 24643.27
τ11 Patient.BCR_ABL1Mt 311111.95
ρ01 Patient -0.33
ICC 0.46
N Patient 9
N run_id.uniq 31
Observations 834
Marginal R2 / Conditional R2 0.223 / 0.584

Bootstrapped fixed effects CIs

set.seed(1234)

tel.lmer2_3_par_boot <-
  bootstrap(
    tel.lmer2_3.REML,
    .f = fixef,
    type = "parametric",
    B = 3000
  )
tel.lmer2_3_par_boot
## Bootstrap type: parametric 
## 
## Number of resamples: 3000 
## 
##                  term   observed   rep.mean         se        bias
## 1         (Intercept) 6179.87522 6167.39458 485.735226 -12.4806434
## 2 age_at_sample_exact  -30.78562  -30.52931   8.951039   0.2563043
## 3          BCR_ABL1Mt -556.89988 -555.87460 227.158700   1.0252743
## 
## There were 1370 messages, 0 warnings, and 0 errors.
## 
## The most commonly occurring message was: boundary (singular) fit: see help('isSingular')

Restricted to first 1000/3000 bootstrapped models

tel.lmer2_3_par_boot2 <- tel.lmer2_3_par_boot
tel.lmer2_3_par_boot2$replicates <- tel.lmer2_3_par_boot$replicates[1:1000, ]
lmeresampler:::confint.lmeresamp(tel.lmer2_3_par_boot, type = "perc") %>% tab_df()
term estimate lower upper type level
(Intercept) 6179.88 5207.41 7098.60 perc 0.95
age_at_sample_exact -30.79 -47.83 -13.24 perc 0.95
BCR_ABL1Mt -556.90 -988.32 -103.54 perc 0.95
keep.lgl.vec <- sapply(tel.lmer2_3_par_boot$message, is.null) & sapply(tel.lmer2_3_par_boot$warning, is.null) & sapply(tel.lmer2_3_par_boot$error, is.null)

Restricted to first 1000/1566 converged non-singular bootstrapped models

tel.lmer2_3_par_boot3 <- tel.lmer2_3_par_boot
tel.lmer2_3_par_boot3$replicates <- tel.lmer2_3_par_boot$replicates[keep.lgl.vec, ][1:1000,]
lmeresampler:::confint.lmeresamp(tel.lmer2_3_par_boot3, type = "perc") %>% tab_df()
term estimate lower upper type level
(Intercept) 6179.88 5216.22 7086.12 perc 0.95
age_at_sample_exact -30.79 -48.16 -12.59 perc 0.95
BCR_ABL1Mt -556.90 -993.16 -85.98 perc 0.95

Plot final model


d <-
  ggpredict(
    model = tel.lmer2_3.REML,
    terms = c("age_at_sample_exact [0:90 by=5]", "BCR_ABL1"),
    ci.lvl = 0.95,
    type = "fe"
  )

tel_final <-
d %>% as_tibble() %>%
  mutate(group = case_when(group == "Wt" ~ "Absence", group == "Mt" ~ "Presence")) %>%
  mutate(`*BCR::ABL1* status` := group) %>%
  ggplot(aes(x = x,
             y = predicted,
             group = `*BCR::ABL1* status`,)) +
  geom_line(aes(colour = `*BCR::ABL1* status`)) +
  geom_ribbon(aes(ymin = conf.low,
                  ymax = conf.high,
                  fill  = `*BCR::ABL1* status`),
              alpha = 0.2) +
  scale_colour_manual(name = "*BCR::ABL1* status", values = driver.palette2b) +
  scale_fill_manual(name = "*BCR::ABL1* status", values = driver.palette2b) +
  new_scale_fill() +
  new_scale_colour() +
  geom_quasirandom(
    data = per_sample_statistics.n834.df %>%
      mutate(
        response = Length,
        x = age_at_sample_exact,
        group = driver2
      ) %>% mutate(`Driver status` = factor(
        driver2,
        levels = c("BCR_ABL", "WT", "Other"),
        labels = c("*BCR::ABL1*", "Wild-type", "Other")
      )) %>% mutate(`*BCR::ABL1* status` := group),
    aes(x = x, y = response, colour = `Driver status`),
    alpha = 0.5
  ) +
  scale_colour_manual(values = driver.palette) +
  # scale_fill_manual(values = driver.palette) +
  xlab(get_x_title(d)) +
  ylab(get_y_title(d)) +
  guides(colour = guide_legend(override.aes = list(linetype = c(rep(
    0, length(driver.palette)
  ))))) +
  new_scale_colour() +
  geom_abline(
    data = as_tibble(
      list(
        a = 4512.38235,
        b = -30.80877,
        Reference = "Mitchell et al. -30bp/year"
      )
    ),
    aes(
      intercept = a,
      slope = b,
      colour = Reference
    ),
    lty = 2,
    show.legend = TRUE
  ) +
  scale_colour_manual(values = "black") +
  ggtitle(unlist(str_split(
    as.character(tel.lmer2_3.REML@call), pattern = "\\="
  ))[2]) +
  theme_bw(base_size = 16) +
  theme(legend.title = element_markdown(), legend.text = element_markdown()) 

tel_final  

Figure 3g

Per patient estimated mean telomere length (bp) by age of sampling, annotated by driver status and mixed model trend lines.

Figure 3h

Per sample dot plot describing mean telomere length (bp) by sample clonal status for PD51635, faceted by time of sampling and annotated by per sample driver status. Clonal status was classified as samples with either BCR::ABL1, Other driver or coalescence after the first 100 mutations.


tel_length <- sym("Length")
tel_length_label <- attr(per_sample_statistics.n834.df$Length, "label") 

Figure_3h.dat <-
  per_sample_statistics.n834.df %>%
  filter(Patient == "PD51635", !is.na(status)) %>%
  mutate(CH = ifelse(status == "NOCH" & driver3 != "WT", "CH", status)) %>% 
  mutate(
    group = paste0(Patient, CH),
    Patient = factor(
      Patient,
      levels = c(
        "PD57334",
        "PD56961",
        "PD57335",
        "PD51634",
        "PD51633",
        "PD51632",
        "PD51635",
        "PD57333",
        "PD57332"
      )
    ),
    `*BCR::ABL1* status` = factor(
      BCR_ABL1,
      levels = c("Wt", "Mt", "Normal"),
      labels = c("Absence", "Presence", "Normal")
    ),
     `Driver status` = factor(
      driver2,
      levels = c("BCR_ABL", "Other", "WT"),
      labels = c("*BCR::ABL1*", "Other", "Wild-type")
    ),
    `Clonal status` = factor(
      CH,
      levels = c("CH", "NOCH"),
      labels = c("+", "-")
    ),
    {
      {
        tel_length_label
      }
    } := !!tel_length,
    `Mutations SNV (adjusted)` = nsub_adj,
    `Age (years)` = age_at_sample_exact
  ) 
  
Figure_3h <- 
  Figure_3h.dat %>% 
    ggplot(aes(
      `Clonal status`,
      !!sym(tel_length_label),
      colour = `Driver status`,
      group.by = group,
    )) +
    geom_quasirandom(width = 0.4, alpha = 0.5) +
    scale_colour_manual(values = driver.palette) +
    theme_bw(base_size = 16) +
    theme(legend.text = element_markdown()) +
    facet_nested(~ Patient + `Age (years)`, scales = "free_x") +
    force_panelsizes(cols = c(1, 2)) + theme(panel.spacing = unit(0, "lines"))

Figure_3h

ggsave(filename =  paste0("../figures/v1/", "figure_3h_V1.pdf"), plot = Figure_3h, width = 4.5, height =4.5 )

SessionInfo

sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS 15.2
## 
## Matrix products: default
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] parameters_0.21.1  performance_0.10.4 sjPlot_2.8.14      lmeresampler_0.2.4 dfoptim_2023.1.0   optimx_2023-8.13   lme4_1.1-33        Matrix_1.5-1      
##  [9] patchwork_1.2.0    scales_1.3.0       ggh4x_0.3.0        ggnewscale_0.5.0   ggeffects_1.2.2    ggtext_0.1.2       ggbeeswarm_0.7.2   magrittr_2.0.3    
## [17] bookdown_0.33      magick_2.7.4       rvest_1.0.3        lubridate_1.9.2    forcats_1.0.0      stringr_1.5.1      dplyr_1.1.2        purrr_1.0.1       
## [25] readr_2.1.4        tidyr_1.3.0        tibble_3.2.1       ggplot2_3.5.1      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.4.1      HLMdiag_0.5.0        Hmisc_5.1-0          systemfonts_1.0.4    plyr_1.8.8           splines_4.1.3        TH.data_1.1-2       
##   [8] digest_0.6.31        import_1.3.0         htmltools_0.5.5      fansi_1.0.4          checkmate_2.2.0      cluster_2.1.2        see_0.8.0           
##  [15] tzdb_0.3.0           modelr_0.1.10        vroom_1.6.1          sandwich_3.0-2       timechange_0.2.0     colorspace_2.1-0     ggdist_3.2.1        
##  [22] textshaping_0.3.6    haven_2.5.2          xfun_0.39            crayon_1.5.2         jsonlite_1.8.8       survival_3.2-13      zoo_1.8-12          
##  [29] glue_1.6.2           gtable_0.3.4         emmeans_1.8.7        sjstats_0.18.2       sjmisc_2.8.9         distributional_0.3.2 mvtnorm_1.1-3       
##  [36] Rcpp_1.0.10          viridisLite_0.4.2    xtable_1.8-4         gridtext_0.1.5       htmlTable_2.4.2      foreign_0.8-82       bit_4.0.5           
##  [43] Formula_1.2-5        datawizard_0.7.1     htmlwidgets_1.6.2    httr_1.4.5           ellipsis_0.3.2       pkgconfig_2.0.3      farver_2.1.1        
##  [50] nnet_7.3-17          sass_0.4.7           utf8_1.2.3           janitor_2.2.0        tidyselect_1.2.0     labeling_0.4.3       rlang_1.1.1         
##  [57] reshape2_1.4.4       effectsize_0.8.3     munsell_0.5.0        nlmeU_0.70-9         tools_4.1.3          cachem_1.0.8         cli_3.6.1           
##  [64] generics_0.1.3       sjlabelled_1.2.0     broom_1.0.4          evaluate_0.23        fastmap_1.1.1        ragg_1.2.5           yaml_2.3.7          
##  [71] knitr_1.45           bit64_4.0.5          nlme_3.1-155         pracma_2.4.2         xml2_1.3.3           compiler_4.1.3       rstudioapi_0.14     
##  [78] beeswarm_0.4.0       statmod_1.5.0        bslib_0.4.2          stringi_1.8.2        highr_0.10           lattice_0.21-8       commonmark_1.9.0    
##  [85] diagonals_6.4.0      nloptr_2.0.3         markdown_1.11        vctrs_0.6.2          pillar_1.9.0         lifecycle_1.0.4      jquerylib_0.1.4     
##  [92] estimability_1.4.1   data.table_1.14.8    insight_0.19.2       R6_2.5.1             gridExtra_2.3        vipor_0.4.5          codetools_0.2-18    
##  [99] boot_1.3-28          MASS_7.3-60          withr_2.5.2          multcomp_1.4-25      mgcv_1.8-39          bayestestR_0.13.1    parallel_4.1.3      
## [106] hms_1.1.2            grid_4.1.3           rpart_4.1.16         coda_0.19-4          minqa_1.2.5          rmarkdown_2.25       snakecase_0.11.0    
## [113] numDeriv_2016.8-1.1  base64enc_0.1-3